Todo:

library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
-- Attaching packages ----------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.1.0     v dplyr   1.0.2
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.0
package 㤼㸱tibble㤼㸲 was built under R version 4.0.4-- Conflicts -------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(readxl)
library(plotly)
package 㤼㸱plotly㤼㸲 was built under R version 4.0.5
Attaching package: 㤼㸱plotly㤼㸲

The following object is masked from 㤼㸱package:ggplot2㤼㸲:

    last_plot

The following object is masked from 㤼㸱package:stats㤼㸲:

    filter

The following object is masked from 㤼㸱package:graphics㤼㸲:

    layout
ghg_emissions_clean <- ghg_emissions_data %>% 
  janitor::clean_names() %>% 
  mutate(units = "megatonnes of co2 equivelant") %>% 
  rename(emissions = emissions_mt_co2e) %>% 
  filter(emission_year != "BaseYear") %>% 
  mutate(emission_year = as.numeric(emission_year)) %>% 
  select(ccp_mapping, source_name, pollutant, year = emission_year, value = 
           emissions, units)
ghg_emissions_clean %>% 
  write_csv("data/clean_data/ghg_emissions.csv")
plotly::ggplotly(
  ghg_emissions_clean %>% 
    filter(year == 2018) %>%
    filter(pollutant %in% c("CO2", "CH4")) %>%
    ggplot() +
    aes(x = factor(ccp_mapping, levels = rev(levels(factor(ccp_mapping)))),
        y = value, fill = source_name,
        text = paste0('</br> Sector: ', ccp_mapping,
                      '</br> Emissions: ', value,
                      '</br> Source Name: ', source_name)) +
    geom_col(position = "stack") +
    theme_bw() +
    theme(legend.position = "none") +
    labs(x = "Sector",
         y = paste0("Emissions (", ghg_emissions_clean$units[1], ")")) +
    coord_flip(),
    tooltip = 'text'
  )
NA
ghg_emissions_data %>% 
  names()
[1] "National Communication Categories" "SG Source Sector"                  "CCP mapping"                       "IPCC"                             
[5] "SourceName"                        "Pollutant"                         "EmissionYear"                      "Emissions (MtCO2e)"               
ghg_emissions_data %>% 
  select()
input <- list()

input$col_choice = "national_communication_categories"
ghg_emissions_clean %>%
  group_by_(input$col_choice, "emission_year") %>% 
  summarise(total_ghg_emissions = sum(emissions))
`summarise()` regrouping output by 'national_communication_categories' (override with `.groups` argument)
ghg_emissions_data %>% 
  distinct(`National Communication Categories`)
ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  group_by(EmissionYear) %>% 
  summarise(total_ghg_emissions = sum(`Emissions (MtCO2e)`)) %>% 
  ggplot() +
  aes(x = EmissionYear, y = total_ghg_emissions) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1990,2020,5)) +
  ylim(0, 80) +
  theme(legend.position = 0) +
  theme_bw()
`summarise()` ungrouping output (override with `.groups` argument)

ghg_emissions_data %>% 
  distinct(`CCP mapping`)
ghg_emissions_data %>% 
  distinct(`National Communication Categories`)
ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  group_by(`CCP mapping`, EmissionYear) %>% 
  summarise(total_ghg_emissions = sum(`Emissions (MtCO2e)`)) %>% 
  ggplot() +
  aes(x = EmissionYear, y = total_ghg_emissions, group = `CCP mapping`, colour = `CCP mapping`) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1990,2020,5))
`summarise()` regrouping output by 'CCP mapping' (override with `.groups` argument)

ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  filter(`National Communication Categories` != `CCP mapping`) %>% 
  select(`National Communication Categories`, `CCP mapping`) %>% 
  unique()

category_id,category_name,subcategory_id,subcategory_name,year,emissions,emission

emissions_sankey <- emissions_data %>% 
  select(ccp_mapping, source_name, pollutant, emission_year, emissions, units)
filtered_df <- ghg_emissions_clean %>% 
  select(ccp_mapping, source_name, pollutant, emission_year, emissions, units) %>% 
  filter(pollutant == "CO2") %>% 
  filter(emission_year == "2005")
total_emissions_for_gas <- filtered_df %>% 
  summarise(sum(emissions)) %>% 
  pull()

total_emissions_by_category <- filtered_df %>% 
  select(-pollutant) %>% 
  group_by(ccp_mapping) %>% 
  summarise(cat_sum = sum(emissions), .groups = 'drop_last')
categories <- filtered_df %>% 
  distinct(ccp_mapping) %>% 
  pull()

n_categories <- length(categories)

sources <- filtered_df %>% 
  distinct(source_name) %>% 
  pull()

n_sources <- length(sources)
n_sources
[1] 159
node_names <- c("Total", categories, sources, "Other")

node_names_df <- data.frame("name" = node_names)

total_sankey_tibble <- total_emissions_by_category %>%
  mutate(total = "Total") %>% 
  mutate(total = match(total, node_names) -1) %>% 
  mutate(ccp_mapping = match(ccp_mapping, node_names) -1) %>% 
  select(source = total,
         target = ccp_mapping,
         value = cat_sum)

total_filtered_emissions <- total_sankey_tibble %>% 
  summarise(sum(value)) %>% 
  pull()

other_emissions <- total_emissions_for_gas - total_filtered_emissions

total_other_sankey_tibble <- tibble(
  "source" = c(0),
  "target" = (match("Other", node_names) -1),
  "value" = c(other_emissions)
)


sub_sankey_tibble <- filtered_df %>% 
  select(- c(units, pollutant, emission_year)) %>% 
  mutate(ccp_mapping = match(ccp_mapping, node_names) -1,
         source_name = match(source_name, node_names) -1)

names(sub_sankey_tibble) = c("source", "target", "value")

sankey_tibble <- total_sankey_tibble %>% 
  bind_rows(sub_sankey_tibble) %>% 
  bind_rows(total_other_sankey_tibble)

links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))

# Add a 'group' column to each connection:
links <- links_matrix %>% 
  mutate(group = case_when(
    source == 0 ~ paste("type_", target, sep = ""),
    source!=0 ~ paste("type_", source, sep = "")
  ))

nodes <- node_names_df
# Add a 'group' column to each node.
# All of them in the same group to make them the same colour
nodes$group <- as.factor(c("my_unique_group"))

emissions <- list()

emissions$nodes <- nodes
emissions$links <- links
total_filtered_emissions <- total_sankey_tibble %>% 
  summarise(sum(value)) %>% 
  pull()

other_emissions <- total_emissions_for_gas - total_filtered_emissions

total_other_sankey_tibble <- tibble(
  "source" = c(0),
  "target" = (match("Other", node_names) -1),
  "value" = c(other_emissions)
)

sub_sankey_tibble <- filtered_tibble %>% 
  select(-category_id, -subcategory_id, -year) %>% 
  mutate(category_name = match(category_name, node_names) -1,
         subcategory_name = match(subcategory_name, node_names) -1)

names(sub_sankey_tibble) = c("source", "target", "value")

sankey_tibble <- total_sankey_tibble %>% 
  bind_rows(sub_sankey_tibble) %>% 
  bind_rows(total_other_sankey_tibble)

links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))

# Add a 'group' column to each connection:
links <- links_matrix %>% 
  mutate(group = case_when(
    source == 0 ~ paste("type_", target, sep = ""),
    source!=0 ~ paste("type_", source, sep = "")
  ))

nodes <- node_names_df
# Add a 'group' column to each node.
# All of them in the same group to make them the same colour
nodes$group <- as.factor(c("my_unique_group"))

emissions <- list()

emissions$nodes <- nodes
emissions$links <- links
make_sankey_dfs <- function(data, userYear, userGas) {
  n_categories <- data %>% 
    distinct(category_name) %>% 
    nrow()
  
  total_emissions_for_gas <- data %>% 
    filter(emission == userGas()) %>% 
    filter(year == userYear()) %>%
    summarise(sum(emissions)) %>% 
    pull()
  
  filtered_tibble <- data %>%
    filter(emission == userGas()) %>% 
    select(-emission) %>% 
    filter(year == userYear()) %>% 
    filter(emissions > userResolution())
  
  total_emissions_by_cat <- filtered_tibble %>%
    group_by(category_name) %>% 
    summarise(cat_sum = sum(emissions), .groups = 'drop_last')
  
  categories <- filtered_tibble %>%
    distinct(category_name) %>% 
    pull()
  
  subcategories <- filtered_tibble %>%
    distinct(subcategory_name) %>% 
    pull()
  
  node_names <- c("Total", categories, subcategories, "Other")
  
  node_names_df <- data.frame("name" = node_names)
  
  total_sankey_tibble <- total_emissions_by_cat %>%
    mutate(total = "Total") %>% 
    mutate(total = match(total, node_names) -1) %>% 
    mutate(category_name = match(category_name, node_names) -1) %>% 
    select(source = total,
           target = category_name,
           value = cat_sum)
  
  total_filtered_emissions <- total_sankey_tibble %>% 
    summarise(sum(value)) %>% 
    pull()
  
  other_emissions <- total_emissions_for_gas - total_filtered_emissions
  
  total_other_sankey_tibble <- tibble(
    "source" = c(0),
    "target" = (match("Other", node_names) -1),
    "value" = c(other_emissions)
  )
  
  sub_sankey_tibble <- filtered_tibble %>% 
    select(-category_id, -subcategory_id, -year) %>% 
    mutate(category_name = match(category_name, node_names) -1,
           subcategory_name = match(subcategory_name, node_names) -1)
  
  names(sub_sankey_tibble) = c("source", "target", "value")
  
  sankey_tibble <- total_sankey_tibble %>% 
    bind_rows(sub_sankey_tibble) %>% 
    bind_rows(total_other_sankey_tibble)
  
  links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))
  
  # Add a 'group' column to each connection:
  links <- links_matrix %>% 
    mutate(group = case_when(
      source == 0 ~ paste("type_", target, sep = ""),
      source!=0 ~ paste("type_", source, sep = "")
    ))
  
  nodes <- node_names_df
  # Add a 'group' column to each node.
  # All of them in the same group to make them the same colour
  nodes$group <- as.factor(c("my_unique_group"))
  
  emissions <- list()
  
  emissions$nodes <- nodes
  emissions$links <- links
  
  return(emissions)
}
make_sankey_dfs(emissions_sankey, userYear = 2005, userGas = "CH4", userResolution = 50)
---
title: "R Notebook"
output: html_notebook
---
# Todo:

- add columns for totals for each pollutant/sector

```{r}
library(tidyverse)
library(readxl)
library(plotly)
```

```{r}
ghg_emissions_data <- read_xlsx("data/raw_data/scottish-ghg-dataset-2018.xlsx", 2)
```

```{r}
ghg_emissions_clean <- ghg_emissions_data %>% 
  janitor::clean_names() %>% 
  mutate(units = "megatonnes of co2 equivelant") %>% 
  rename(emissions = emissions_mt_co2e) %>% 
  filter(emission_year != "BaseYear") %>% 
  mutate(emission_year = as.numeric(emission_year)) %>% 
  select(ccp_mapping, source_name, pollutant, year = emission_year, value = 
           emissions, units)
```

```{r}
ghg_emissions_clean %>% 
  write_csv("data/clean_data/ghg_emissions.csv")
```


```{r}
plotly::ggplotly(
  ghg_emissions_clean %>% 
    filter(year == 2018) %>%
    filter(pollutant %in% c("CO2", "CH4")) %>%
    ggplot() +
    aes(x = factor(ccp_mapping, levels = rev(levels(factor(ccp_mapping)))),
        y = value, fill = source_name,
        text = paste0('</br> Sector: ', ccp_mapping,
                      '</br> Emissions: ', value,
                      '</br> Source Name: ', source_name)) +
    geom_col(position = "stack") +
    theme_bw() +
    theme(legend.position = "none") +
    labs(x = "Sector",
         y = paste0("Emissions (", ghg_emissions_clean$units[1], ")")) +
    coord_flip(),
    tooltip = 'text'
  )

```


```{r}
ghg_emissions_data %>% 
  names()
```

```{r}
ghg_emissions_data %>% 
  select()
```



```{r}
input <- list()

input$col_choice = "national_communication_categories"
```



```{r}
ghg_emissions_clean %>%
  group_by_(input$col_choice, "emission_year") %>% 
  summarise(total_ghg_emissions = sum(emissions)) %>% 
  ggplot() +
  aes(x = EmissionYear, y = total_ghg_emissions, group = `National Communication Categories`, colour = `National Communication Categories`) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1990,2020,5)) +
  theme(legend.position = 0)
```

```{r}
ghg_emissions_data %>% 
  distinct(`National Communication Categories`)
```
```{r}
ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  group_by(EmissionYear) %>% 
  summarise(total_ghg_emissions = sum(`Emissions (MtCO2e)`)) %>% 
  ggplot() +
  aes(x = EmissionYear, y = total_ghg_emissions) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1990,2020,5)) +
  ylim(0, 80) +
  theme(legend.position = 0) +
  theme_bw()
```


```{r}
ghg_emissions_data %>% 
  distinct(`CCP mapping`)
```

```{r}

```


```{r}
ghg_emissions_data %>% 
  distinct(`National Communication Categories`)
```
```{r}
ghg_emissions_data
```
```{r}
ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  group_by(`CCP mapping`, EmissionYear) %>% 
  summarise(total_ghg_emissions = sum(`Emissions (MtCO2e)`)) %>% 
  ggplot() +
  aes(x = EmissionYear, y = total_ghg_emissions, group = `CCP mapping`, colour = `CCP mapping`) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(1990,2020,5))
```

```{r}
ghg_emissions_data %>% 
  filter(EmissionYear != "BaseYear") %>% 
  mutate(EmissionYear = as.numeric(EmissionYear)) %>% 
  filter(`National Communication Categories` != `CCP mapping`) %>% 
  select(`National Communication Categories`, `CCP mapping`) %>% 
  unique()
```
category_id,category_name,subcategory_id,subcategory_name,year,emissions,emission

```{r}
emissions_sankey <- emissions_data %>% 
  select(ccp_mapping, source_name, pollutant, emission_year, emissions, units)
```


```{r}
ghg_emissions_clean
```


```{r}
filtered_df <- ghg_emissions_clean %>% 
  select(ccp_mapping, source_name, pollutant, emission_year, emissions, units) %>% 
  filter(pollutant == "CO2") %>% 
  filter(emission_year == "2005")
```

```{r}
total_emissions_for_gas <- filtered_df %>% 
  summarise(sum(emissions)) %>% 
  pull()

total_emissions_by_category <- filtered_df %>% 
  select(-pollutant) %>% 
  group_by(ccp_mapping) %>% 
  summarise(cat_sum = sum(emissions), .groups = 'drop_last')
```

```{r}
categories <- filtered_df %>% 
  distinct(ccp_mapping) %>% 
  pull()

n_categories <- length(categories)

sources <- filtered_df %>% 
  distinct(source_name) %>% 
  pull()

n_sources <- length(sources)
```

```{r}
n_sources
```


```{r}
node_names <- c("Total", categories, sources, "Other")

node_names_df <- data.frame("name" = node_names)

total_sankey_tibble <- total_emissions_by_category %>%
  mutate(total = "Total") %>% 
  mutate(total = match(total, node_names) -1) %>% 
  mutate(ccp_mapping = match(ccp_mapping, node_names) -1) %>% 
  select(source = total,
         target = ccp_mapping,
         value = cat_sum)

total_filtered_emissions <- total_sankey_tibble %>% 
  summarise(sum(value)) %>% 
  pull()

other_emissions <- total_emissions_for_gas - total_filtered_emissions

total_other_sankey_tibble <- tibble(
  "source" = c(0),
  "target" = (match("Other", node_names) -1),
  "value" = c(other_emissions)
)


sub_sankey_tibble <- filtered_df %>% 
  select(- c(units, pollutant, emission_year)) %>% 
  mutate(ccp_mapping = match(ccp_mapping, node_names) -1,
         source_name = match(source_name, node_names) -1)

names(sub_sankey_tibble) = c("source", "target", "value")

sankey_tibble <- total_sankey_tibble %>% 
  bind_rows(sub_sankey_tibble) %>% 
  bind_rows(total_other_sankey_tibble)

links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))

# Add a 'group' column to each connection:
links <- links_matrix %>% 
  mutate(group = case_when(
    source == 0 ~ paste("type_", target, sep = ""),
    source!=0 ~ paste("type_", source, sep = "")
  ))

nodes <- node_names_df
# Add a 'group' column to each node.
# All of them in the same group to make them the same colour
nodes$group <- as.factor(c("my_unique_group"))

emissions <- list()

emissions$nodes <- nodes
emissions$links <- links


```






```{r}
total_filtered_emissions <- total_sankey_tibble %>% 
  summarise(sum(value)) %>% 
  pull()

other_emissions <- total_emissions_for_gas - total_filtered_emissions

total_other_sankey_tibble <- tibble(
  "source" = c(0),
  "target" = (match("Other", node_names) -1),
  "value" = c(other_emissions)
)

sub_sankey_tibble <- filtered_tibble %>% 
  select(-category_id, -subcategory_id, -year) %>% 
  mutate(category_name = match(category_name, node_names) -1,
         subcategory_name = match(subcategory_name, node_names) -1)

names(sub_sankey_tibble) = c("source", "target", "value")

sankey_tibble <- total_sankey_tibble %>% 
  bind_rows(sub_sankey_tibble) %>% 
  bind_rows(total_other_sankey_tibble)

links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))

# Add a 'group' column to each connection:
links <- links_matrix %>% 
  mutate(group = case_when(
    source == 0 ~ paste("type_", target, sep = ""),
    source!=0 ~ paste("type_", source, sep = "")
  ))

nodes <- node_names_df
# Add a 'group' column to each node.
# All of them in the same group to make them the same colour
nodes$group <- as.factor(c("my_unique_group"))

emissions <- list()

emissions$nodes <- nodes
emissions$links <- links
```








```{r}
make_sankey_dfs <- function(data, userYear, userGas) {
  n_categories <- data %>% 
    distinct(category_name) %>% 
    nrow()
  
  total_emissions_for_gas <- data %>% 
    filter(emission == userGas()) %>% 
    filter(year == userYear()) %>%
    summarise(sum(emissions)) %>% 
    pull()
  
  filtered_tibble <- data %>%
    filter(emission == userGas()) %>% 
    select(-emission) %>% 
    filter(year == userYear()) %>% 
    filter(emissions > userResolution())
  
  total_emissions_by_cat <- filtered_tibble %>%
    group_by(category_name) %>% 
    summarise(cat_sum = sum(emissions), .groups = 'drop_last')
  
  categories <- filtered_tibble %>%
    distinct(category_name) %>% 
    pull()
  
  subcategories <- filtered_tibble %>%
    distinct(subcategory_name) %>% 
    pull()
  
  node_names <- c("Total", categories, subcategories, "Other")
  
  node_names_df <- data.frame("name" = node_names)
  
  total_sankey_tibble <- total_emissions_by_cat %>%
    mutate(total = "Total") %>% 
    mutate(total = match(total, node_names) -1) %>% 
    mutate(category_name = match(category_name, node_names) -1) %>% 
    select(source = total,
           target = category_name,
           value = cat_sum)
  
  total_filtered_emissions <- total_sankey_tibble %>% 
    summarise(sum(value)) %>% 
    pull()
  
  other_emissions <- total_emissions_for_gas - total_filtered_emissions
  
  total_other_sankey_tibble <- tibble(
    "source" = c(0),
    "target" = (match("Other", node_names) -1),
    "value" = c(other_emissions)
  )
  
  sub_sankey_tibble <- filtered_tibble %>% 
    select(-category_id, -subcategory_id, -year) %>% 
    mutate(category_name = match(category_name, node_names) -1,
           subcategory_name = match(subcategory_name, node_names) -1)
  
  names(sub_sankey_tibble) = c("source", "target", "value")
  
  sankey_tibble <- total_sankey_tibble %>% 
    bind_rows(sub_sankey_tibble) %>% 
    bind_rows(total_other_sankey_tibble)
  
  links_matrix <- data.frame(as.matrix(sankey_tibble, byrow = TRUE, ncols = 3))
  
  # Add a 'group' column to each connection:
  links <- links_matrix %>% 
    mutate(group = case_when(
      source == 0 ~ paste("type_", target, sep = ""),
      source!=0 ~ paste("type_", source, sep = "")
    ))
  
  nodes <- node_names_df
  # Add a 'group' column to each node.
  # All of them in the same group to make them the same colour
  nodes$group <- as.factor(c("my_unique_group"))
  
  emissions <- list()
  
  emissions$nodes <- nodes
  emissions$links <- links
  
  return(emissions)
}
```


```{r}
make_sankey_dfs(emissions_sankey, userYear = 2005, userGas = "CH4", userResolution = 50)
```
```{r}

```

